Environment Setting
suppressMessages({
library(data.table)
library(dplyr)
library(plotly)
library(abn)
})Data Acquisition
var_path <- "data/20230311_tips.csv"
data <- var_path %>% fread()Pre-proccess
data <- data %>% mutate(
sex = sex %>% factor(., c("Female", "Male")),
smoker = smoker %>% factor(., c("No", "Yes")),
day = day %>% factor(., c("Thur", "Fri", "Sat", "Sun")),
time = time %>% factor(., c("Lunch", "Dinner"))
)Check Data
data %>% skimr::skim()| Name | Piped data |
| Number of rows | 244 |
| Number of columns | 7 |
| Key | NULL |
| _______________________ | |
| Column type frequency: | |
| factor | 4 |
| numeric | 3 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| sex | 0 | 1 | FALSE | 2 | Mal: 157, Fem: 87 |
| smoker | 0 | 1 | FALSE | 2 | No: 151, Yes: 93 |
| day | 0 | 1 | FALSE | 4 | Sat: 87, Sun: 76, Thu: 62, Fri: 19 |
| time | 0 | 1 | FALSE | 2 | Din: 176, Lun: 68 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| total_bill | 0 | 1 | 19.79 | 8.90 | 3.07 | 13.35 | 17.8 | 24.13 | 50.81 | ▃▇▃▁▁ |
| tip | 0 | 1 | 3.00 | 1.38 | 1.00 | 2.00 | 2.9 | 3.56 | 10.00 | ▇▆▂▁▁ |
| size | 0 | 1 | 2.57 | 0.95 | 1.00 | 2.00 | 2.0 | 3.00 | 6.00 | ▇▂▂▁▁ |
Visualization
3D Scatter
tip ~ size + total_bill
data %>%
plot_ly(x = ~size, y = ~total_bill, z = ~tip, type = "scatter3d", mode = "markers")tip ~ size + total_bill + smoker
data %>%
plot_ly(x = ~size, y = ~total_bill, z = ~tip, color = ~smoker, type = "scatter3d", mode = "markers")Scatter with lm
tip ~ size + total_bill + day +smoker + sex
ggplotly(
data %>%
mutate(
day = day %>% paste("day =", .),
smoker = smoker %>% paste("smoker =", .)
) %>%
ggplot(aes(x = total_bill, y = tip, color = sex)) +
geom_smooth(method = "lm", se = F, size = .5, linetype = 2, formula = "y ~ x") +
geom_point(alpha = .8) +
facet_wrap(smoker ~ day, ncol = 4, scales = "free") +
scale_x_continuous(labels = scales::label_comma()) +
scale_y_continuous(labels = scales::label_comma()) +
colorspace::scale_color_discrete_diverging("Green-Orange") +
colorspace::scale_fill_discrete_diverging("Green-Orange") +
ggthemes::theme_fivethirtyeight() +
theme(legend.position = "bottom")
)Box plot
Scatter with lm: tip ~ total_bill + day + sex
ggplotly(data %>%
mutate(
day = day %>% paste("day =", .),
smoker = smoker %>% paste("smoker =", .)
) %>%
ggplot(aes(x = sex, y = tip, color = sex)) +
geom_boxplot() +
facet_wrap(~day, ncol = 4, scales = "free") +
scale_y_continuous(labels = scales::label_comma()) +
colorspace::scale_color_discrete_diverging("Green-Orange") +
colorspace::scale_fill_discrete_diverging("Green-Orange") +
ggthemes::theme_fivethirtyeight() +
theme(legend.position = "bottom"))Hypothesis testing
Structure Discover
data_abn <- data %>%
mutate(day = forcats::fct_recode(day, "Weekday" = "Thur", "Weekday" = "Fri", "Weekend" = "Sat", "Weekend" = "Sun")) %>%
mutate_at(.vars = c("total_bill", "tip"), ~ log(.)) %>%
data.frame()
data.dists <- list(
"total_bill" = "gaussian",
"tip" = "gaussian",
"sex" = "binomial",
"smoker" = "binomial",
"day" = "binomial",
"time" = "binomial",
"size" = "gaussian"
)
dag.banned <- matrix(0, ncol(data_abn), ncol(data_abn), dimnames = list(names(data.dists), names(data.dists)))
dag.banned["size", "total_bill"] <- 1
dag.banned["total_bill", "tip"] <- 1
dag.banned["size", "tip"] <- 1
data_abn %>%
buildScoreCache(data.df = ., data.dists = data.dists, max.parents = 1, dag.banned = dag.banned) %>%
mostProbable() %>%
fitAbn() %>%
plot()## Step1. completed max alpha_i(S) for all i and S
## Total sets g(S) to be evaluated over: 128
Regression
list_model <- c("total_bill", "size", "total_bill + size") %>%
paste("tip ~", .) %>%
c(., "total_bill ~ size") %>%
lapply(., as.formula) %>%
lapply(., function(x) {
glm(data = data_abn, formula = x)
})
suppressMessages({
list_model %>%
jtools::export_summs(.,
error_format = "[{conf.low}, {conf.high}]",
model.names = c("total bill", "size", "both", "total bill by size size")
)
})| total bill | size | both | total bill by size size | |
|---|---|---|---|---|
| (Intercept) | -0.95 *** | 0.44 *** | -0.89 *** | 2.19 *** |
| [-1.22, -0.68] | [0.30, 0.57] | [-1.16, -0.61] | [2.06, 2.32] | |
| total_bill | 0.68 *** | 0.60 *** | ||
| [0.58, 0.77] | [0.49, 0.72] | |||
| size | 0.22 *** | 0.06 * | 0.27 *** | |
| [0.17, 0.27] | [0.00, 0.11] | [0.23, 0.32] | ||
| N | 244 | 244 | 244 | 244 |
| AIC | 141.35 | 228.26 | 138.81 | 191.31 |
| BIC | 151.85 | 238.75 | 152.80 | 201.80 |
| Pseudo R2 | 0.67 | 0.34 | 0.68 | 0.50 |
| *** p < 0.001; ** p < 0.01; * p < 0.05. | ||||
Plot model fitness
total_bill
par(mfrow = c(2, 2))
list_model[[1]] %>% plot()both
par(mfrow = c(2, 2))
list_model[[3]] %>% plot()model_mediate = mediation::mediate(model.m = list_model[[4]], model.y = list_model[[3]],
treat = "size", mediator = "total_bill")
model_mediate%>%summary##
## Causal Mediation Analysis
##
## Quasi-Bayesian Confidence Intervals
##
## Estimate 95% CI Lower 95% CI Upper p-value
## ACME 0.16403 0.12468 0.21 <2e-16 ***
## ADE 0.05661 0.00808 0.11 0.026 *
## Total Effect 0.22064 0.16836 0.27 <2e-16 ***
## Prop. Mediated 0.74180 0.56467 0.96 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Sample Size Used: 244
##
##
## Simulations: 1000
model_mediate%>%plot